Data Science and Machine Learning Using Python Capstone Project¶


Guidelines

  • Using the dataset provided, create a binary classification.
  • Perform the following:
    • Perform an exploratory analysis (scaling, PCA, unbalanced)
    • Split the data (train, validation, test)
    • Perform 10-fold cross-validation and grid search (method of your choice)
    • Compare different classification methods (Logistic Regression, KNN, SVM, RF, XGBOOST)
    • Show evaluation metrics (ROC-AUC, accuracy, f-1 score)
In [ ]:
import os
import pickle
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
from typing import Union, Type, TypeVar
from typing_extensions import Self
from matplotlib import pyplot as plt
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, PolynomialFeatures

%config InlineBackend.figure_format = "retina"
%matplotlib inline
warnings.simplefilter('ignore')
In [ ]:
color_a1 = '#664c85'
color_a2 = '#5b92b6'
color_a3 = '#52d5bd'
color_a4 = '#eeeeee'
palette_a1 = 'mako'
palette_a2 = 'mako_r'

Initialization¶

Loading the given dataset¶

Initial look on the buffered dataset

In [ ]:
pd.read_csv('data/SP901_CS_completedata.csv').head()
Out[ ]:
PatientID;Failure.binary;Entropy_cooc.W.ADC;GLNU_align.H.PET;Min_hist.PET;Max_hist.PET;Mean_hist.PET;Variance_hist.PET;Standard_Deviation_hist.PET;Skewness_hist.PET;Kurtosis_hist.PET;Energy_hist.PET;Entropy_hist.PET;AUC_hist.PET;H_suv.PET;Volume.PET;X3D_surface.PET;ratio_3ds_vol.PET;ratio_3ds_vol_norm.PET;irregularity.PET;tumor_length.PET;Compactness_v1.PET;Compactness_v2.PET;Spherical_disproportion.PET;Sphericity.PET;Asphericity.PET;Center_of_mass.PET;Max_3D_diam.PET;Major_axis_length.PET;Minor_axis_length.PET;Least_axis_length.PET;Elongation.PET;Flatness.PET;Max_cooc.L.PET;Average_cooc.L.PET;Variance_cooc.L.PET;Entropy_cooc.L.PET;DAVE_cooc.L.PET;DVAR_cooc.L.PET;DENT_cooc.L.PET;SAVE_cooc.L.PET;SVAR_cooc.L.PET;SENT_cooc.L.PET;ASM_cooc.L.PET;Contrast_cooc.L.PET;Dissimilarity_cooc.L.PET;Inv_diff_cooc.L.PET;Inv_diff_norm_cooc.L.PET;IDM_cooc.L.PET;IDM_norm_cooc.L.PET;Inv_var_cooc.L.PET;Correlation_cooc.L.PET;Autocorrelation_cooc.L.PET;Tendency_cooc.L.PET;Shade_cooc.L.PET;Prominence_cooc.L.PET;IC1_.L.PET;IC2_.L.PET;Coarseness_vdif_.L.PET;Contrast_vdif_.L.PET;Busyness_vdif_.L.PET;Complexity_vdif_.L.PET;Strength_vdif_.L.PET;SRE_align.L.PET;LRE_align.L.PET;GLNU_align.L.PET;RLNU_align.L.PET;RP_align.L.PET;LGRE_align.L.PET;HGRE_align.L.PET;LGSRE_align.L.PET;HGSRE_align.L.PET;LGHRE_align.L.PET;HGLRE_align.L.PET;GLNU_norm_align.L.PET;RLNU_norm_align.L.PET;GLVAR_align.L.PET;RLVAR_align.L.PET;Entropy_align.L.PET;SZSE.L.PET;LZSE.L.PET;LGLZE.L.PET;HGLZE.L.PET;SZLGE.L.PET;SZHGE.L.PET;LZLGE.L.PET;LZHGE.L.PET;GLNU_area.L.PET;ZSNU.L.PET;ZSP.L.PET;GLNU_norm.L.PET;ZSNU_norm.L.PET;GLVAR_area.L.PET;ZSVAR.L.PET;Entropy_area.L.PET;Max_cooc.H.PET;Average_cooc.H.PET;Variance_cooc.H.PET;Entropy_cooc.H.PET;DAVE_cooc.H.PET;DVAR_cooc.H.PET;DENT_cooc.H.PET;SAVE_cooc.H.PET;SVAR_cooc.H.PET;SENT_cooc.H.PET;ASM_cooc.H.PET;Contrast_cooc.H.PET;Dissimilarity_cooc.H.PET;Inv_diff_cooc.H.PET;Inv_diff_norm_cooc.H.PET;IDM_cooc.H.PET;IDM_norm_cooc.H.PET;Inv_var_cooc_.H.PET;Correlation_cooc.H.PET;Autocorrelation_cooc.H.PET;Tendency_cooc.H.PET;Shade_cooc.H.PET;Prominence_cooc.H.PET;IC1_d.H.PET;IC2_d.H.PET;Coarseness_vdif.H.PET;Contrast_vdif.H.PET;Busyness_vdif.H.PET;Complexity_vdif.H.PET;Strength_vdif.H.PET;SRE_align.H.PET;LRE_align.H.PET;RLNU_align.H.PET;RP_align.H.PET;LGRE_align.H.PET;HGRE_align.H.PET;LGSRE_align.H.PET;HGSRE_align.H.PET;LGHRE_align.H.PET;HGLRE_align.H.PET;GLNU_norm_align.H.PET;RLNU_norm_align.H.PET;GLVAR_align.H.PET;RLVAR_align.H.PET;Entropy_align.H.PET;SZSE.H.PET;LZSE.H.PET;LGLZE.H.PET;HGLZE.H.PET;SZLGE.H.PET;SZHGE.H.PET;LZLGE.H.PET;LZHGE.H.PET;GLNU_area.H.PET;ZSNU.H.PET;ZSP.H.PET;GLNU_norm.H.PET;ZSNU_norm.H.PET;GLVAR_area.H.PET;ZSVAR_H.PET;Entropy_area.H.PET;Max_cooc.W.PET;Average_cooc.W.PET;Variance_cooc.W.PET;Entropy_cooc.W.PET;DAVE_cooc.W.PET;DVAR_cooc.W.PET;DENT_cooc.W.PET;SAVE_cooc.W.PET;SVAR_cooc.W.PET;SENT_cooc.W.PET;ASM_cooc.W.PET;Contrast_cooc.W.PET;Dissimilarity_cooc.W.PET;Inv_diff_cooc.W.PET;Inv_diff_norm_cooc.W.PET;IDM_cooc.W.PET;IDM_norm_cooc.W.PET;Inv_var_cooc.W.PET;Correlation_cooc.W.PET;Autocorrelation_cooc.W.PET;Tendency_cooc.W.PET;Shade_cooc.W.PET;Prominence_cooc.W.PET;IC1_d.W.PET;IC2_d.W.PET;Coarseness_vdif.W.PET;Contrast_vdif.W.PET;Busyness_vdif.W.PET;Complexity_vdif.W.PET;Strength_vdif.W.PET;SRE_align.W.PET;LRE_align.W.PET;GLNU_align.W.PET;RLNU_align.W.PET;RP_align.W.PET;LGRE_align.W.PET;HGRE_align.W.PET;LGSRE_align.W.PET;HGSRE_align.W.PET;LGHRE_align.W.PET;HGLRE_align.W.PET;GLNU_norm_align.W.PET;RLNU_norm_align.W.PET;GLVAR_align.W.PET;RLVAR_align.W.PET;Entropy_align.W.PET;SZSE.W.PET;LZSE.W.PET;LGLZE.W.PET;HGLZE.W.PET;SZLGE.W.PET;SZHGE.W.PET;LZLGE.W.PET;LZHGE.W.PET;GLNU_area.W.PET;ZSNU.W.PET;ZSP.W.PET;GLNU_norm.W.PET;ZSNU_norm.W.PET;GLVAR_area.W.PET;ZSVAR.W.PET;Entropy_area.W.PET;Min_hist.ADC;Max_hist.ADC;Mean_hist.ADC;Variance_hist.ADC;Standard_Deviation_hist.ADC;Skewness_hist.ADC;Kurtosis_hist.ADC;Energy_hist.ADC;Entropy_hist.ADC;AUC_hist.ADC;Volume.ADC;X3D_surface.ADC;ratio_3ds_vol.ADC;ratio_3ds_vol_norm.ADC;irregularity.ADC;Compactness_v1.ADC;Compactness_v2.ADC;Spherical_disproportion.ADC;Sphericity.ADC;Asphericity.ADC;Center_of_mass.ADC;Max_3D_diam.ADC;Major_axis_length.ADC;Minor_axis_length.ADC;Least_axis_length.ADC;Elongation.ADC;Flatness.ADC;Max_cooc.L.ADC;Average_cooc.L.ADC;Variance_cooc.L.ADC;Entropy_cooc.L.ADC;DAVE_cooc.L.ADC;DVAR_cooc.L.ADC;DENT_cooc.L.ADC;SAVE_cooc.L.ADC;SVAR_cooc.L.ADC;SENT_cooc.L.ADC;ASM_cooc.L.ADC;Contrast_cooc.L.ADC;Dissimilarity_cooc.L.ADC;Inv_diff_cooc.L.ADC;Inv_diff_norm_cooc.L.ADC;IDM_cooc.L.ADC;IDM_norm_cooc.L.ADC;Inv_var_cooc.L.ADC;Correlation_cooc.L.ADC;Autocorrelation_.L.ADC;Tendency_cooc.L.ADC;Shade_.L.ADC;Prominence_cooc.L.ADC;IC1_.L.ADC;IC2_.L.ADC;Coarseness_vdif_.L.ADC;Contrast_vdif_.L.ADC;Busyness_vdif_.L.ADC;Complexity_vdif_.L.ADC;Strength_vdif_.L.ADC;SRE_align.L.ADC;LRE_align.L.ADC;GLNU_align.L.ADC;RLNU_align.L.ADC;RP_align.L.ADC;LGRE_align.L.ADC;HGRE_align.L.ADC;LGSRE_align.L.ADC;HGSRE_align.L.ADC;LGHRE_align.L.ADC;HGLRE_align.L.ADC;GLNU_norm_align.L.ADC;RLNU_norm_align.L.ADC;GLVAR_align.L.ADC;RLVAR_align.L.ADC;Entropy_align.L.ADC;SZSE.L.ADC;LZSE.L.ADC;LGLZE.L.ADC;HGLZE.L.ADC;SZLGE.L.ADC;SZHGE.L.ADC;LZLGE.L.ADC;LZHGE.L.ADC;GLNU_area.L.ADC;ZSNU.L.ADC;ZSP.L.ADC;GLNU_norm.L.ADC;ZSNU_norm.L.ADC;GLVAR_area.L.ADC;ZSVAR.L.ADC;Entropy_area.L.ADC;Max_cooc.H.ADC;Average_cooc.H.ADC;Variance_cooc.H.ADC;Entropy_cooc.H.ADC;DAVE_cooc.H.ADC;DVAR_cooc.H.ADC;DENT_cooc.H.ADC;SAVE_cooc.H.ADC;SVAR_cooc.H.ADC;SENT_cooc.H.ADC;ASM_cooc.H.ADC;Contrast_cooc.H.ADC;Dissimilarity_cooc.H.ADC;Inv_diff_cooc.H.ADC;Inv_diff_norm_cooc.H.ADC;IDM_cooc.H.ADC;IDM_norm_cooc.H.ADC;Inv_var_cooc.H.ADC;Correlation_cooc.H.ADC;Autocorrelation_cooc.H.ADC;Tendency_cooc.H.ADC;Shade_cooc.H.ADC;Prominence_cooc.H.ADC;IC1_d.H.ADC;IC2_d.H.ADC;Coarseness_vdif.H.ADC;Contrast_vdif.H.ADC;Busyness_vdif.H.ADC;Complexity_vdif.H.ADC;Strength_vdif.H.ADC;SRE_align.H.ADC;LRE_align.H.ADC;GLNU_align.H.ADC;RLNU_align.H.ADC;RP_align.H.ADC;LGRE_align.H.ADC;HGRE_align.H.ADC;LGSRE_align.H.ADC;HGSRE_align.H.ADC;LGHRE_align.H.ADC;HGLRE_align.H.ADC;GLNU_norm_align.H.ADC;RLNU_norm_align.H.ADC;GLVAR_align.H.ADC;RLVAR_align.H.ADC;Entropy_align.H.ADC;SZSE.H.ADC;LZSE.H.ADC;LGLZE.H.ADC;HGLZE.H.ADC;SZLGE.H.ADC;SZHGE.H.ADC;LZLGE.H.ADC;LZHGE.H.ADC;GLNU_area.H.ADC;ZSNU.H.ADC;ZSP.H.ADC;GLNU_norm.H.ADC;ZSNU_norm.H.ADC;GLVAR_area.H.ADC;ZSVAR.H.ADC;Entropy_area.H.ADC;Max_cooc.W.ADC;Average_cooc.W.ADC;Variance_cooc.W.ADC;DAVE_cooc.W.ADC;DVAR_cooc.W.ADC;DENT_cooc.W.ADC;SAVE_cooc.W.ADC;SVAR_cooc.W.ADC;SENT_cooc.W.ADC;ASM_cooc.W.ADC;Contrast_cooc.W.ADC;Dissimilarity_cooc.W.ADC;Inv_diff_cooc.W.ADC;Inv_diff_norm_cooc.W.ADC;IDM_cooc.W.ADC;IDM_norm_cooc.W.ADC;Inv_var_cooc.W.ADC;Correlation_cooc.W.ADC;Autocorrelation_cooc.W.ADC;Tendency_cooc.W.ADC;Shade_cooc.W.ADC;Prominence_cooc.W.ADC;IC1_d.W.ADC;IC2_d.W.ADC;Coarseness_vdif.W.ADC;Contrast_vdif.W.ADC;Busyness_vdif.W.ADC;Complexity_vdif.W.ADC;Strength_vdif.W.ADC;SRE_align.W.ADC;LRE_align.W.ADC;GLNU_align.W.ADC;RLNU_align.W.ADC;RP_align.W.ADC;LGRE_align.W.ADC;HGRE_align.W.ADC;LGSRE_align.W.ADC;HGSRE_align.W.ADC;LGHRE_align.W.ADC;HGLRE_align.W.ADC;GLNU_norm_align.W.ADC;RLNU_norm_align.W.ADC;GLVAR_align.W.ADC;RLVAR_align.W.ADC;Entropy_align.W.ADC;SZSE.W.ADC;LZSE.W.ADC;LGLZE.W.ADC;HGLZE.W.ADC;SZLGE.W.ADC;SZHGE.W.ADC;LZLGE.W.ADC;LZHGE.W.ADC;GLNU_area.W.ADC;ZSNU.W.ADC;ZSP.W.ADC;GLNU_norm.W.ADC;ZSNU_norm.W.ADC;GLVAR_area.W.ADC;ZSVAR.W.ADC;Entropy_area.W.ADC
0 1;0;12.85352;46.256345;6.249117;17.825541;9.78...
1 2;1;12.21115;27.45454;11.005214;26.469077;15.4...
2 3;0;12.75682;90.195696;2.777718;6.877486;4.295...
3 4;1;13.4673;325.64333;6.296588;22.029843;10.33...
4 5;0;12.63733;89.579042;3.583846;7.922501;4.454...

With how the dataframe looks like, it seems to have a delimiter of ';' so we have to pass delimiter = ';'

In [ ]:
df = pd.read_csv('data/SP901_CS_completedata.csv', delimiter = ';', index_col = 'PatientID')

df.head(10)
Out[ ]:
Failure.binary Entropy_cooc.W.ADC GLNU_align.H.PET Min_hist.PET Max_hist.PET Mean_hist.PET Variance_hist.PET Standard_Deviation_hist.PET Skewness_hist.PET Kurtosis_hist.PET ... LZLGE.W.ADC LZHGE.W.ADC GLNU_area.W.ADC ZSNU.W.ADC ZSP.W.ADC GLNU_norm.W.ADC ZSNU_norm.W.ADC GLVAR_area.W.ADC ZSVAR.W.ADC Entropy_area.W.ADC
PatientID
1 0 12.85352 46.256345 6.249117 17.825541 9.783773 6.814365 2.612479 0.688533 -0.339727 ... 0.00690 6201.93480 4.13400 239.28938 0.97918 0.01899 0.95586 1145.10496 0.02586 6.28632
2 1 12.21115 27.454540 11.005214 26.469077 15.426640 12.932074 3.598298 0.789526 -0.319613 ... 0.00423 16054.01263 8.37627 644.73702 0.95637 0.01461 0.93288 847.52537 0.04153 6.77853
3 0 12.75682 90.195696 2.777718 6.877486 4.295330 0.923425 0.962163 0.248637 -0.944246 ... 0.00453 6674.63840 13.11686 1165.70261 0.97268 0.02501 0.91537 1923.85705 0.07104 7.15685
4 1 13.46730 325.643330 6.296588 22.029843 10.334779 6.649795 2.580759 0.832011 0.855861 ... 0.00888 17172.90951 23.84726 2760.41293 0.97203 0.01069 0.94658 1329.95290 0.03848 7.29521
5 0 12.63733 89.579042 3.583846 7.922501 4.454175 0.572094 0.757225 1.574845 3.250288 ... 0.00405 13231.94294 8.14437 784.59729 0.96469 0.02526 0.93769 1116.38669 0.05223 7.05149
6 1 13.16159 101.713446 2.597947 6.206142 3.769041 0.615282 0.785315 0.610611 -0.090239 ... 0.00377 22707.42806 5.93657 893.17913 0.97662 0.00884 0.95272 2743.23763 0.03055 7.54787
7 0 12.20341 36.798444 8.653594 28.223361 14.923360 17.700730 4.209453 0.839347 0.183203 ... 0.00483 19242.69402 4.73870 440.80678 0.96565 0.01261 0.94041 1261.05997 0.05213 6.96438
8 0 12.27549 50.997414 5.711431 12.998990 7.733934 2.604651 1.615639 0.909312 0.065658 ... 0.00380 16807.22779 10.34041 816.97750 0.96109 0.01429 0.93197 983.07375 0.05754 6.88756
9 1 13.36502 27.171292 5.879695 14.770986 9.116435 4.399354 2.099390 0.457283 -0.443650 ... 0.00638 15722.04342 3.77576 261.12149 0.98788 0.01017 0.97323 779.86641 0.01718 6.36800
10 1 12.64322 20.180627 5.695684 17.137587 8.545943 6.118426 2.475564 1.213924 1.243357 ... 0.00428 9429.90834 6.87608 719.03350 0.96407 0.01140 0.93031 2232.72933 0.04609 7.25058

10 rows × 429 columns

Rearranging the dataframe and placing the target column to the last index

In [ ]:
# Excluding the `Failure.binary` column and converting it to list
column_rearranged = list(df.columns[1:])

# Appending the `Failure.binary` column to the list
column_rearranged.append(df.columns[0])
In [ ]:
df = df[column_rearranged]

df.head(10)
Out[ ]:
Entropy_cooc.W.ADC GLNU_align.H.PET Min_hist.PET Max_hist.PET Mean_hist.PET Variance_hist.PET Standard_Deviation_hist.PET Skewness_hist.PET Kurtosis_hist.PET Energy_hist.PET ... LZHGE.W.ADC GLNU_area.W.ADC ZSNU.W.ADC ZSP.W.ADC GLNU_norm.W.ADC ZSNU_norm.W.ADC GLVAR_area.W.ADC ZSVAR.W.ADC Entropy_area.W.ADC Failure.binary
PatientID
1 12.85352 46.256345 6.249117 17.825541 9.783773 6.814365 2.612479 0.688533 -0.339727 0.005095 ... 6201.93480 4.13400 239.28938 0.97918 0.01899 0.95586 1145.10496 0.02586 6.28632 0
2 12.21115 27.454540 11.005214 26.469077 15.426640 12.932074 3.598298 0.789526 -0.319613 0.006297 ... 16054.01263 8.37627 644.73702 0.95637 0.01461 0.93288 847.52537 0.04153 6.77853 1
3 12.75682 90.195696 2.777718 6.877486 4.295330 0.923425 0.962163 0.248637 -0.944246 0.005015 ... 6674.63840 13.11686 1165.70261 0.97268 0.02501 0.91537 1923.85705 0.07104 7.15685 0
4 13.46730 325.643330 6.296588 22.029843 10.334779 6.649795 2.580759 0.832011 0.855861 0.003289 ... 17172.90951 23.84726 2760.41293 0.97203 0.01069 0.94658 1329.95290 0.03848 7.29521 1
5 12.63733 89.579042 3.583846 7.922501 4.454175 0.572094 0.757225 1.574845 3.250288 0.008066 ... 13231.94294 8.14437 784.59729 0.96469 0.02526 0.93769 1116.38669 0.05223 7.05149 0
6 13.16159 101.713446 2.597947 6.206142 3.769041 0.615282 0.785315 0.610611 -0.090239 0.005237 ... 22707.42806 5.93657 893.17913 0.97662 0.00884 0.95272 2743.23763 0.03055 7.54787 1
7 12.20341 36.798444 8.653594 28.223361 14.923360 17.700730 4.209453 0.839347 0.183203 0.004674 ... 19242.69402 4.73870 440.80678 0.96565 0.01261 0.94041 1261.05997 0.05213 6.96438 0
8 12.27549 50.997414 5.711431 12.998990 7.733934 2.604651 1.615639 0.909312 0.065658 0.006540 ... 16807.22779 10.34041 816.97750 0.96109 0.01429 0.93197 983.07375 0.05754 6.88756 0
9 13.36502 27.171292 5.879695 14.770986 9.116435 4.399354 2.099390 0.457283 -0.443650 0.007034 ... 15722.04342 3.77576 261.12149 0.98788 0.01017 0.97323 779.86641 0.01718 6.36800 1
10 12.64322 20.180627 5.695684 17.137587 8.545943 6.118426 2.475564 1.213924 1.243357 0.009571 ... 9429.90834 6.87608 719.03350 0.96407 0.01140 0.93031 2232.72933 0.04609 7.25058 1

10 rows × 429 columns

The dataset did not come with a schema information so we'll have to put more time on exploratory data analysis to explore, investigate, learn, and summarize the dataset's main characteristics.

Dataframe information¶

In [ ]:
df.shape
Out[ ]:
(197, 429)
In [ ]:
df.info(verbose = True, show_counts = True)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 197 entries, 1 to 197
Data columns (total 429 columns):
 #    Column                       Non-Null Count  Dtype  
---   ------                       --------------  -----  
 0    Entropy_cooc.W.ADC           197 non-null    float64
 1    GLNU_align.H.PET             197 non-null    float64
 2    Min_hist.PET                 197 non-null    float64
 3    Max_hist.PET                 197 non-null    float64
 4    Mean_hist.PET                197 non-null    float64
 5    Variance_hist.PET            197 non-null    float64
 6    Standard_Deviation_hist.PET  197 non-null    float64
 7    Skewness_hist.PET            197 non-null    float64
 8    Kurtosis_hist.PET            197 non-null    float64
 9    Energy_hist.PET              197 non-null    float64
 10   Entropy_hist.PET             197 non-null    float64
 11   AUC_hist.PET                 197 non-null    float64
 12   H_suv.PET                    197 non-null    float64
 13   Volume.PET                   197 non-null    float64
 14   X3D_surface.PET              197 non-null    float64
 15   ratio_3ds_vol.PET            197 non-null    float64
 16   ratio_3ds_vol_norm.PET       197 non-null    float64
 17   irregularity.PET             197 non-null    float64
 18   tumor_length.PET             197 non-null    float64
 19   Compactness_v1.PET           197 non-null    float64
 20   Compactness_v2.PET           197 non-null    float64
 21   Spherical_disproportion.PET  197 non-null    float64
 22   Sphericity.PET               197 non-null    float64
 23   Asphericity.PET              197 non-null    float64
 24   Center_of_mass.PET           197 non-null    float64
 25   Max_3D_diam.PET              197 non-null    float64
 26   Major_axis_length.PET        197 non-null    float64
 27   Minor_axis_length.PET        197 non-null    float64
 28   Least_axis_length.PET        197 non-null    float64
 29   Elongation.PET               197 non-null    float64
 30   Flatness.PET                 197 non-null    float64
 31   Max_cooc.L.PET               197 non-null    float64
 32   Average_cooc.L.PET           197 non-null    float64
 33   Variance_cooc.L.PET          197 non-null    float64
 34   Entropy_cooc.L.PET           197 non-null    float64
 35   DAVE_cooc.L.PET              197 non-null    float64
 36   DVAR_cooc.L.PET              197 non-null    float64
 37   DENT_cooc.L.PET              197 non-null    float64
 38   SAVE_cooc.L.PET              197 non-null    float64
 39   SVAR_cooc.L.PET              197 non-null    float64
 40   SENT_cooc.L.PET              197 non-null    float64
 41   ASM_cooc.L.PET               197 non-null    float64
 42   Contrast_cooc.L.PET          197 non-null    float64
 43   Dissimilarity_cooc.L.PET     197 non-null    float64
 44   Inv_diff_cooc.L.PET          197 non-null    float64
 45   Inv_diff_norm_cooc.L.PET     197 non-null    float64
 46   IDM_cooc.L.PET               197 non-null    float64
 47   IDM_norm_cooc.L.PET          197 non-null    float64
 48   Inv_var_cooc.L.PET           197 non-null    float64
 49   Correlation_cooc.L.PET       197 non-null    float64
 50   Autocorrelation_cooc.L.PET   197 non-null    float64
 51   Tendency_cooc.L.PET          197 non-null    float64
 52   Shade_cooc.L.PET             197 non-null    float64
 53   Prominence_cooc.L.PET        197 non-null    float64
 54   IC1_.L.PET                   197 non-null    float64
 55   IC2_.L.PET                   197 non-null    float64
 56   Coarseness_vdif_.L.PET       197 non-null    float64
 57   Contrast_vdif_.L.PET         197 non-null    float64
 58   Busyness_vdif_.L.PET         197 non-null    float64
 59   Complexity_vdif_.L.PET       197 non-null    float64
 60   Strength_vdif_.L.PET         197 non-null    float64
 61   SRE_align.L.PET              197 non-null    float64
 62   LRE_align.L.PET              197 non-null    float64
 63   GLNU_align.L.PET             197 non-null    float64
 64   RLNU_align.L.PET             197 non-null    float64
 65   RP_align.L.PET               197 non-null    float64
 66   LGRE_align.L.PET             197 non-null    float64
 67   HGRE_align.L.PET             197 non-null    float64
 68   LGSRE_align.L.PET            197 non-null    float64
 69   HGSRE_align.L.PET            197 non-null    float64
 70   LGHRE_align.L.PET            197 non-null    float64
 71   HGLRE_align.L.PET            197 non-null    float64
 72   GLNU_norm_align.L.PET        197 non-null    float64
 73   RLNU_norm_align.L.PET        197 non-null    float64
 74   GLVAR_align.L.PET            197 non-null    float64
 75   RLVAR_align.L.PET            197 non-null    float64
 76   Entropy_align.L.PET          197 non-null    float64
 77   SZSE.L.PET                   197 non-null    float64
 78   LZSE.L.PET                   197 non-null    float64
 79   LGLZE.L.PET                  197 non-null    float64
 80   HGLZE.L.PET                  197 non-null    float64
 81   SZLGE.L.PET                  197 non-null    float64
 82   SZHGE.L.PET                  197 non-null    float64
 83   LZLGE.L.PET                  197 non-null    float64
 84   LZHGE.L.PET                  197 non-null    float64
 85   GLNU_area.L.PET              197 non-null    float64
 86   ZSNU.L.PET                   197 non-null    float64
 87   ZSP.L.PET                    197 non-null    float64
 88   GLNU_norm.L.PET              197 non-null    float64
 89   ZSNU_norm.L.PET              197 non-null    float64
 90   GLVAR_area.L.PET             197 non-null    float64
 91   ZSVAR.L.PET                  197 non-null    float64
 92   Entropy_area.L.PET           197 non-null    float64
 93   Max_cooc.H.PET               197 non-null    float64
 94   Average_cooc.H.PET           197 non-null    float64
 95   Variance_cooc.H.PET          197 non-null    float64
 96   Entropy_cooc.H.PET           197 non-null    float64
 97   DAVE_cooc.H.PET              197 non-null    float64
 98   DVAR_cooc.H.PET              197 non-null    float64
 99   DENT_cooc.H.PET              197 non-null    float64
 100  SAVE_cooc.H.PET              197 non-null    float64
 101  SVAR_cooc.H.PET              197 non-null    float64
 102  SENT_cooc.H.PET              197 non-null    float64
 103  ASM_cooc.H.PET               197 non-null    float64
 104  Contrast_cooc.H.PET          197 non-null    float64
 105  Dissimilarity_cooc.H.PET     197 non-null    float64
 106  Inv_diff_cooc.H.PET          197 non-null    float64
 107  Inv_diff_norm_cooc.H.PET     197 non-null    float64
 108  IDM_cooc.H.PET               197 non-null    float64
 109  IDM_norm_cooc.H.PET          197 non-null    float64
 110  Inv_var_cooc_.H.PET          197 non-null    float64
 111  Correlation_cooc.H.PET       197 non-null    float64
 112  Autocorrelation_cooc.H.PET   197 non-null    float64
 113  Tendency_cooc.H.PET          197 non-null    float64
 114  Shade_cooc.H.PET             197 non-null    float64
 115  Prominence_cooc.H.PET        197 non-null    float64
 116  IC1_d.H.PET                  197 non-null    float64
 117  IC2_d.H.PET                  197 non-null    float64
 118  Coarseness_vdif.H.PET        197 non-null    float64
 119  Contrast_vdif.H.PET          197 non-null    float64
 120  Busyness_vdif.H.PET          197 non-null    float64
 121  Complexity_vdif.H.PET        197 non-null    float64
 122  Strength_vdif.H.PET          197 non-null    float64
 123  SRE_align.H.PET              197 non-null    float64
 124  LRE_align.H.PET              197 non-null    float64
 125  RLNU_align.H.PET             197 non-null    float64
 126  RP_align.H.PET               197 non-null    float64
 127  LGRE_align.H.PET             197 non-null    float64
 128  HGRE_align.H.PET             197 non-null    float64
 129  LGSRE_align.H.PET            197 non-null    float64
 130  HGSRE_align.H.PET            197 non-null    float64
 131  LGHRE_align.H.PET            197 non-null    float64
 132  HGLRE_align.H.PET            197 non-null    float64
 133  GLNU_norm_align.H.PET        197 non-null    float64
 134  RLNU_norm_align.H.PET        197 non-null    float64
 135  GLVAR_align.H.PET            197 non-null    float64
 136  RLVAR_align.H.PET            197 non-null    float64
 137  Entropy_align.H.PET          197 non-null    float64
 138  SZSE.H.PET                   197 non-null    float64
 139  LZSE.H.PET                   197 non-null    float64
 140  LGLZE.H.PET                  197 non-null    float64
 141  HGLZE.H.PET                  197 non-null    float64
 142  SZLGE.H.PET                  197 non-null    float64
 143  SZHGE.H.PET                  197 non-null    float64
 144  LZLGE.H.PET                  197 non-null    float64
 145  LZHGE.H.PET                  197 non-null    float64
 146  GLNU_area.H.PET              197 non-null    float64
 147  ZSNU.H.PET                   197 non-null    float64
 148  ZSP.H.PET                    197 non-null    float64
 149  GLNU_norm.H.PET              197 non-null    float64
 150  ZSNU_norm.H.PET              197 non-null    float64
 151  GLVAR_area.H.PET             197 non-null    float64
 152  ZSVAR_H.PET                  197 non-null    float64
 153  Entropy_area.H.PET           197 non-null    float64
 154  Max_cooc.W.PET               197 non-null    float64
 155  Average_cooc.W.PET           197 non-null    float64
 156  Variance_cooc.W.PET          197 non-null    float64
 157  Entropy_cooc.W.PET           197 non-null    float64
 158  DAVE_cooc.W.PET              197 non-null    float64
 159  DVAR_cooc.W.PET              197 non-null    float64
 160  DENT_cooc.W.PET              197 non-null    float64
 161  SAVE_cooc.W.PET              197 non-null    float64
 162  SVAR_cooc.W.PET              197 non-null    float64
 163  SENT_cooc.W.PET              197 non-null    float64
 164  ASM_cooc.W.PET               197 non-null    float64
 165  Contrast_cooc.W.PET          197 non-null    float64
 166  Dissimilarity_cooc.W.PET     197 non-null    float64
 167  Inv_diff_cooc.W.PET          197 non-null    float64
 168  Inv_diff_norm_cooc.W.PET     197 non-null    float64
 169  IDM_cooc.W.PET               197 non-null    float64
 170  IDM_norm_cooc.W.PET          197 non-null    float64
 171  Inv_var_cooc.W.PET           197 non-null    float64
 172  Correlation_cooc.W.PET       197 non-null    float64
 173  Autocorrelation_cooc.W.PET   197 non-null    float64
 174  Tendency_cooc.W.PET          197 non-null    float64
 175  Shade_cooc.W.PET             197 non-null    float64
 176  Prominence_cooc.W.PET        197 non-null    float64
 177  IC1_d.W.PET                  197 non-null    float64
 178  IC2_d.W.PET                  197 non-null    float64
 179  Coarseness_vdif.W.PET        197 non-null    float64
 180  Contrast_vdif.W.PET          197 non-null    float64
 181  Busyness_vdif.W.PET          197 non-null    float64
 182  Complexity_vdif.W.PET        197 non-null    float64
 183  Strength_vdif.W.PET          197 non-null    float64
 184  SRE_align.W.PET              197 non-null    float64
 185  LRE_align.W.PET              197 non-null    float64
 186  GLNU_align.W.PET             197 non-null    float64
 187  RLNU_align.W.PET             197 non-null    float64
 188  RP_align.W.PET               197 non-null    float64
 189  LGRE_align.W.PET             197 non-null    float64
 190  HGRE_align.W.PET             197 non-null    float64
 191  LGSRE_align.W.PET            197 non-null    float64
 192  HGSRE_align.W.PET            197 non-null    float64
 193  LGHRE_align.W.PET            197 non-null    float64
 194  HGLRE_align.W.PET            197 non-null    float64
 195  GLNU_norm_align.W.PET        197 non-null    float64
 196  RLNU_norm_align.W.PET        197 non-null    float64
 197  GLVAR_align.W.PET            197 non-null    float64
 198  RLVAR_align.W.PET            197 non-null    float64
 199  Entropy_align.W.PET          197 non-null    float64
 200  SZSE.W.PET                   197 non-null    float64
 201  LZSE.W.PET                   197 non-null    float64
 202  LGLZE.W.PET                  197 non-null    float64
 203  HGLZE.W.PET                  197 non-null    float64
 204  SZLGE.W.PET                  197 non-null    float64
 205  SZHGE.W.PET                  197 non-null    float64
 206  LZLGE.W.PET                  197 non-null    float64
 207  LZHGE.W.PET                  197 non-null    float64
 208  GLNU_area.W.PET              197 non-null    float64
 209  ZSNU.W.PET                   197 non-null    float64
 210  ZSP.W.PET                    197 non-null    float64
 211  GLNU_norm.W.PET              197 non-null    float64
 212  ZSNU_norm.W.PET              197 non-null    float64
 213  GLVAR_area.W.PET             197 non-null    float64
 214  ZSVAR.W.PET                  197 non-null    float64
 215  Entropy_area.W.PET           197 non-null    float64
 216  Min_hist.ADC                 197 non-null    float64
 217  Max_hist.ADC                 197 non-null    float64
 218  Mean_hist.ADC                197 non-null    float64
 219  Variance_hist.ADC            197 non-null    float64
 220  Standard_Deviation_hist.ADC  197 non-null    float64
 221  Skewness_hist.ADC            197 non-null    float64
 222  Kurtosis_hist.ADC            197 non-null    float64
 223  Energy_hist.ADC              197 non-null    float64
 224  Entropy_hist.ADC             197 non-null    float64
 225  AUC_hist.ADC                 197 non-null    float64
 226  Volume.ADC                   197 non-null    float64
 227  X3D_surface.ADC              197 non-null    float64
 228  ratio_3ds_vol.ADC            197 non-null    float64
 229  ratio_3ds_vol_norm.ADC       197 non-null    float64
 230  irregularity.ADC             197 non-null    float64
 231  Compactness_v1.ADC           197 non-null    float64
 232  Compactness_v2.ADC           197 non-null    float64
 233  Spherical_disproportion.ADC  197 non-null    float64
 234  Sphericity.ADC               197 non-null    float64
 235  Asphericity.ADC              197 non-null    float64
 236  Center_of_mass.ADC           197 non-null    float64
 237  Max_3D_diam.ADC              197 non-null    float64
 238  Major_axis_length.ADC        197 non-null    float64
 239  Minor_axis_length.ADC        197 non-null    float64
 240  Least_axis_length.ADC        197 non-null    float64
 241  Elongation.ADC               197 non-null    float64
 242  Flatness.ADC                 197 non-null    float64
 243  Max_cooc.L.ADC               197 non-null    float64
 244  Average_cooc.L.ADC           197 non-null    float64
 245  Variance_cooc.L.ADC          197 non-null    float64
 246  Entropy_cooc.L.ADC           197 non-null    float64
 247  DAVE_cooc.L.ADC              197 non-null    float64
 248  DVAR_cooc.L.ADC              197 non-null    float64
 249  DENT_cooc.L.ADC              197 non-null    float64
 250  SAVE_cooc.L.ADC              197 non-null    float64
 251  SVAR_cooc.L.ADC              197 non-null    float64
 252  SENT_cooc.L.ADC              197 non-null    float64
 253  ASM_cooc.L.ADC               197 non-null    float64
 254  Contrast_cooc.L.ADC          197 non-null    float64
 255  Dissimilarity_cooc.L.ADC     197 non-null    float64
 256  Inv_diff_cooc.L.ADC          197 non-null    float64
 257  Inv_diff_norm_cooc.L.ADC     197 non-null    float64
 258  IDM_cooc.L.ADC               197 non-null    float64
 259  IDM_norm_cooc.L.ADC          197 non-null    float64
 260  Inv_var_cooc.L.ADC           197 non-null    float64
 261  Correlation_cooc.L.ADC       197 non-null    float64
 262  Autocorrelation_.L.ADC       197 non-null    float64
 263  Tendency_cooc.L.ADC          197 non-null    float64
 264  Shade_.L.ADC                 197 non-null    float64
 265  Prominence_cooc.L.ADC        197 non-null    float64
 266  IC1_.L.ADC                   197 non-null    float64
 267  IC2_.L.ADC                   197 non-null    float64
 268  Coarseness_vdif_.L.ADC       197 non-null    float64
 269  Contrast_vdif_.L.ADC         197 non-null    float64
 270  Busyness_vdif_.L.ADC         197 non-null    float64
 271  Complexity_vdif_.L.ADC       197 non-null    float64
 272  Strength_vdif_.L.ADC         197 non-null    float64
 273  SRE_align.L.ADC              197 non-null    float64
 274  LRE_align.L.ADC              197 non-null    float64
 275  GLNU_align.L.ADC             197 non-null    float64
 276  RLNU_align.L.ADC             197 non-null    float64
 277  RP_align.L.ADC               197 non-null    float64
 278  LGRE_align.L.ADC             197 non-null    float64
 279  HGRE_align.L.ADC             197 non-null    float64
 280  LGSRE_align.L.ADC            197 non-null    float64
 281  HGSRE_align.L.ADC            197 non-null    float64
 282  LGHRE_align.L.ADC            197 non-null    float64
 283  HGLRE_align.L.ADC            197 non-null    float64
 284  GLNU_norm_align.L.ADC        197 non-null    float64
 285  RLNU_norm_align.L.ADC        197 non-null    float64
 286  GLVAR_align.L.ADC            197 non-null    float64
 287  RLVAR_align.L.ADC            197 non-null    float64
 288  Entropy_align.L.ADC          197 non-null    float64
 289  SZSE.L.ADC                   197 non-null    float64
 290  LZSE.L.ADC                   197 non-null    float64
 291  LGLZE.L.ADC                  197 non-null    float64
 292  HGLZE.L.ADC                  197 non-null    float64
 293  SZLGE.L.ADC                  197 non-null    float64
 294  SZHGE.L.ADC                  197 non-null    float64
 295  LZLGE.L.ADC                  197 non-null    float64
 296  LZHGE.L.ADC                  197 non-null    float64
 297  GLNU_area.L.ADC              197 non-null    float64
 298  ZSNU.L.ADC                   197 non-null    float64
 299  ZSP.L.ADC                    197 non-null    float64
 300  GLNU_norm.L.ADC              197 non-null    float64
 301  ZSNU_norm.L.ADC              197 non-null    float64
 302  GLVAR_area.L.ADC             197 non-null    float64
 303  ZSVAR.L.ADC                  197 non-null    float64
 304  Entropy_area.L.ADC           197 non-null    float64
 305  Max_cooc.H.ADC               197 non-null    float64
 306  Average_cooc.H.ADC           197 non-null    float64
 307  Variance_cooc.H.ADC          197 non-null    float64
 308  Entropy_cooc.H.ADC           197 non-null    float64
 309  DAVE_cooc.H.ADC              197 non-null    float64
 310  DVAR_cooc.H.ADC              197 non-null    float64
 311  DENT_cooc.H.ADC              197 non-null    float64
 312  SAVE_cooc.H.ADC              197 non-null    float64
 313  SVAR_cooc.H.ADC              197 non-null    float64
 314  SENT_cooc.H.ADC              197 non-null    float64
 315  ASM_cooc.H.ADC               197 non-null    float64
 316  Contrast_cooc.H.ADC          197 non-null    float64
 317  Dissimilarity_cooc.H.ADC     197 non-null    float64
 318  Inv_diff_cooc.H.ADC          197 non-null    float64
 319  Inv_diff_norm_cooc.H.ADC     197 non-null    float64
 320  IDM_cooc.H.ADC               197 non-null    float64
 321  IDM_norm_cooc.H.ADC          197 non-null    float64
 322  Inv_var_cooc.H.ADC           197 non-null    float64
 323  Correlation_cooc.H.ADC       197 non-null    float64
 324  Autocorrelation_cooc.H.ADC   197 non-null    float64
 325  Tendency_cooc.H.ADC          197 non-null    float64
 326  Shade_cooc.H.ADC             197 non-null    float64
 327  Prominence_cooc.H.ADC        197 non-null    float64
 328  IC1_d.H.ADC                  197 non-null    float64
 329  IC2_d.H.ADC                  197 non-null    float64
 330  Coarseness_vdif.H.ADC        197 non-null    float64
 331  Contrast_vdif.H.ADC          197 non-null    float64
 332  Busyness_vdif.H.ADC          197 non-null    float64
 333  Complexity_vdif.H.ADC        197 non-null    float64
 334  Strength_vdif.H.ADC          197 non-null    float64
 335  SRE_align.H.ADC              197 non-null    float64
 336  LRE_align.H.ADC              197 non-null    float64
 337  GLNU_align.H.ADC             197 non-null    float64
 338  RLNU_align.H.ADC             197 non-null    float64
 339  RP_align.H.ADC               197 non-null    float64
 340  LGRE_align.H.ADC             197 non-null    float64
 341  HGRE_align.H.ADC             197 non-null    float64
 342  LGSRE_align.H.ADC            197 non-null    float64
 343  HGSRE_align.H.ADC            197 non-null    float64
 344  LGHRE_align.H.ADC            197 non-null    float64
 345  HGLRE_align.H.ADC            197 non-null    float64
 346  GLNU_norm_align.H.ADC        197 non-null    float64
 347  RLNU_norm_align.H.ADC        197 non-null    float64
 348  GLVAR_align.H.ADC            197 non-null    float64
 349  RLVAR_align.H.ADC            197 non-null    float64
 350  Entropy_align.H.ADC          197 non-null    float64
 351  SZSE.H.ADC                   197 non-null    float64
 352  LZSE.H.ADC                   197 non-null    float64
 353  LGLZE.H.ADC                  197 non-null    float64
 354  HGLZE.H.ADC                  197 non-null    float64
 355  SZLGE.H.ADC                  197 non-null    float64
 356  SZHGE.H.ADC                  197 non-null    float64
 357  LZLGE.H.ADC                  197 non-null    float64
 358  LZHGE.H.ADC                  197 non-null    float64
 359  GLNU_area.H.ADC              197 non-null    float64
 360  ZSNU.H.ADC                   197 non-null    float64
 361  ZSP.H.ADC                    197 non-null    float64
 362  GLNU_norm.H.ADC              197 non-null    float64
 363  ZSNU_norm.H.ADC              197 non-null    float64
 364  GLVAR_area.H.ADC             197 non-null    float64
 365  ZSVAR.H.ADC                  197 non-null    float64
 366  Entropy_area.H.ADC           197 non-null    float64
 367  Max_cooc.W.ADC               197 non-null    float64
 368  Average_cooc.W.ADC           197 non-null    float64
 369  Variance_cooc.W.ADC          197 non-null    float64
 370  DAVE_cooc.W.ADC              197 non-null    float64
 371  DVAR_cooc.W.ADC              197 non-null    float64
 372  DENT_cooc.W.ADC              197 non-null    float64
 373  SAVE_cooc.W.ADC              197 non-null    float64
 374  SVAR_cooc.W.ADC              197 non-null    float64
 375  SENT_cooc.W.ADC              197 non-null    float64
 376  ASM_cooc.W.ADC               197 non-null    float64
 377  Contrast_cooc.W.ADC          197 non-null    float64
 378  Dissimilarity_cooc.W.ADC     197 non-null    float64
 379  Inv_diff_cooc.W.ADC          197 non-null    float64
 380  Inv_diff_norm_cooc.W.ADC     197 non-null    float64
 381  IDM_cooc.W.ADC               197 non-null    float64
 382  IDM_norm_cooc.W.ADC          197 non-null    float64
 383  Inv_var_cooc.W.ADC           197 non-null    float64
 384  Correlation_cooc.W.ADC       197 non-null    float64
 385  Autocorrelation_cooc.W.ADC   197 non-null    float64
 386  Tendency_cooc.W.ADC          197 non-null    float64
 387  Shade_cooc.W.ADC             197 non-null    float64
 388  Prominence_cooc.W.ADC        197 non-null    float64
 389  IC1_d.W.ADC                  197 non-null    float64
 390  IC2_d.W.ADC                  197 non-null    float64
 391  Coarseness_vdif.W.ADC        197 non-null    float64
 392  Contrast_vdif.W.ADC          197 non-null    float64
 393  Busyness_vdif.W.ADC          197 non-null    float64
 394  Complexity_vdif.W.ADC        197 non-null    float64
 395  Strength_vdif.W.ADC          197 non-null    float64
 396  SRE_align.W.ADC              197 non-null    float64
 397  LRE_align.W.ADC              197 non-null    float64
 398  GLNU_align.W.ADC             197 non-null    float64
 399  RLNU_align.W.ADC             197 non-null    float64
 400  RP_align.W.ADC               197 non-null    float64
 401  LGRE_align.W.ADC             197 non-null    float64
 402  HGRE_align.W.ADC             197 non-null    float64
 403  LGSRE_align.W.ADC            197 non-null    float64
 404  HGSRE_align.W.ADC            197 non-null    float64
 405  LGHRE_align.W.ADC            197 non-null    float64
 406  HGLRE_align.W.ADC            197 non-null    float64
 407  GLNU_norm_align.W.ADC        197 non-null    float64
 408  RLNU_norm_align.W.ADC        197 non-null    float64
 409  GLVAR_align.W.ADC            197 non-null    float64
 410  RLVAR_align.W.ADC            197 non-null    float64
 411  Entropy_align.W.ADC          197 non-null    float64
 412  SZSE.W.ADC                   197 non-null    float64
 413  LZSE.W.ADC                   197 non-null    float64
 414  LGLZE.W.ADC                  197 non-null    float64
 415  HGLZE.W.ADC                  197 non-null    float64
 416  SZLGE.W.ADC                  197 non-null    float64
 417  SZHGE.W.ADC                  197 non-null    float64
 418  LZLGE.W.ADC                  197 non-null    float64
 419  LZHGE.W.ADC                  197 non-null    float64
 420  GLNU_area.W.ADC              197 non-null    float64
 421  ZSNU.W.ADC                   197 non-null    float64
 422  ZSP.W.ADC                    197 non-null    float64
 423  GLNU_norm.W.ADC              197 non-null    float64
 424  ZSNU_norm.W.ADC              197 non-null    float64
 425  GLVAR_area.W.ADC             197 non-null    float64
 426  ZSVAR.W.ADC                  197 non-null    float64
 427  Entropy_area.W.ADC           197 non-null    float64
 428  Failure.binary               197 non-null    int64  
dtypes: float64(428), int64(1)
memory usage: 661.8 KB
In [ ]:
df.dtypes.value_counts()
Out[ ]:
float64    428
int64        1
dtype: int64
In [ ]:
df.describe()
Out[ ]:
Entropy_cooc.W.ADC GLNU_align.H.PET Min_hist.PET Max_hist.PET Mean_hist.PET Variance_hist.PET Standard_Deviation_hist.PET Skewness_hist.PET Kurtosis_hist.PET Energy_hist.PET ... LZHGE.W.ADC GLNU_area.W.ADC ZSNU.W.ADC ZSP.W.ADC GLNU_norm.W.ADC ZSNU_norm.W.ADC GLVAR_area.W.ADC ZSVAR.W.ADC Entropy_area.W.ADC Failure.binary
count 197.000000 197.000000 197.000000 197.000000 197.000000 197.000000 197.000000 197.000000 197.000000 197.000000 ... 197.000000 197.000000 197.000000 197.000000 197.000000 197.000000 197.000000 197.000000 197.000000 197.000000
mean 12.278600 95.381938 8.513255 24.271413 13.008133 9.257452 3.049220 0.911980 0.490932 0.003647 ... 13333.581481 40.154389 3334.075705 1.193836 0.016572 1.157634 1114.711636 0.065497 8.507117 0.340102
std 1.039816 86.089059 4.985543 14.779666 7.668180 9.303475 1.848637 0.691920 3.041625 0.028831 ... 9140.346577 52.092487 4751.131998 0.423532 0.031414 0.412787 755.908819 0.056252 2.995206 0.474950
min 9.532740 9.445031 1.484508 4.164474 2.424636 0.178752 0.419449 -0.001136 -2.266122 -0.063283 ... 1369.130190 2.015900 84.039160 0.851807 -0.054262 0.792028 253.629375 -0.029824 5.585010 0.000000
25% 11.558840 37.518193 5.151990 13.071684 7.497794 2.258260 1.639108 0.444828 -0.525860 -0.012100 ... 6881.763841 9.340283 741.277380 0.945840 0.001476 0.908540 564.917867 0.031800 6.626250 0.000000
50% 12.278790 80.034684 7.388754 21.013614 11.449486 6.450421 2.734120 0.734796 -0.167186 0.007731 ... 11685.594830 20.363374 1479.035520 0.966065 0.018532 0.938043 983.073750 0.055972 7.025632 0.000000
75% 12.977330 112.145185 11.005214 33.761142 17.386702 12.682440 4.209453 1.199956 0.501737 0.020205 ... 17172.909510 48.480280 3976.605794 1.797414 0.033476 1.677856 1295.180470 0.091940 11.170020 1.000000
max 14.510471 559.351571 28.404496 79.985858 44.043168 49.012054 9.929300 4.901172 33.742118 0.089760 ... 51885.362160 387.348504 35037.698160 1.980520 0.086040 2.007120 4306.766300 0.318752 15.380880 1.000000

8 rows × 429 columns

The class distribution of the samples is kind of imbalanced with 130 of those class 0 and 67 of class 1 but not so bad

In [ ]:
df['Failure.binary'].value_counts()
Out[ ]:
0    130
1     67
Name: Failure.binary, dtype: int64

Determining how many null is in the dataframe

In [ ]:
df.isnull().sum().value_counts()
Out[ ]:
0    429
dtype: int64

Assigning the main variables¶

In [ ]:
var_X = df.iloc[:, :-1]
var_y = df.iloc[:, -1]
In [ ]:
var_X.head()
Out[ ]:
Entropy_cooc.W.ADC GLNU_align.H.PET Min_hist.PET Max_hist.PET Mean_hist.PET Variance_hist.PET Standard_Deviation_hist.PET Skewness_hist.PET Kurtosis_hist.PET Energy_hist.PET ... LZLGE.W.ADC LZHGE.W.ADC GLNU_area.W.ADC ZSNU.W.ADC ZSP.W.ADC GLNU_norm.W.ADC ZSNU_norm.W.ADC GLVAR_area.W.ADC ZSVAR.W.ADC Entropy_area.W.ADC
PatientID
1 12.85352 46.256345 6.249117 17.825541 9.783773 6.814365 2.612479 0.688533 -0.339727 0.005095 ... 0.00690 6201.93480 4.13400 239.28938 0.97918 0.01899 0.95586 1145.10496 0.02586 6.28632
2 12.21115 27.454540 11.005214 26.469077 15.426640 12.932074 3.598298 0.789526 -0.319613 0.006297 ... 0.00423 16054.01263 8.37627 644.73702 0.95637 0.01461 0.93288 847.52537 0.04153 6.77853
3 12.75682 90.195696 2.777718 6.877486 4.295330 0.923425 0.962163 0.248637 -0.944246 0.005015 ... 0.00453 6674.63840 13.11686 1165.70261 0.97268 0.02501 0.91537 1923.85705 0.07104 7.15685
4 13.46730 325.643330 6.296588 22.029843 10.334779 6.649795 2.580759 0.832011 0.855861 0.003289 ... 0.00888 17172.90951 23.84726 2760.41293 0.97203 0.01069 0.94658 1329.95290 0.03848 7.29521
5 12.63733 89.579042 3.583846 7.922501 4.454175 0.572094 0.757225 1.574845 3.250288 0.008066 ... 0.00405 13231.94294 8.14437 784.59729 0.96469 0.02526 0.93769 1116.38669 0.05223 7.05149

5 rows × 428 columns

In [ ]:
var_y.head()
Out[ ]:
PatientID
1    0
2    1
3    0
4    1
5    0
Name: Failure.binary, dtype: int64

Exploratory data analysis¶

Feature distribution overview¶

As the dataset contains 429 columns, plotting pairplot to get a general overview of the dataset's feature relationships and distribution would take so much time to compute.

With that, I will only be including features that have a 0.23 and above correlation with the target column 'Failure.binary'

In [ ]:
column_dist_overview_extract_by_corr = list(
    df.corr()[['Failure.binary']]
    .rename(columns = {'Failure.binary': 'Correlation with target'})
    .query('`Correlation with target` >= 0.23')
    .sort_values(by = 'Correlation with target', ascending = False)
    .index)

print(column_dist_overview_extract_by_corr)
['Failure.binary', 'Entropy_cooc.W.ADC', 'GLNU_align.H.PET', 'Complexity_vdif.W.ADC', 'Prominence_cooc.W.ADC', 'DVAR_cooc.W.ADC', 'Contrast_cooc.W.ADC']
In [ ]:
def plot_pairplot(
    data, hue = None, palette = None,
    height: float = 3.25, aspect: float = 1, title: str = None
) -> sns.pairplot:

    palette = palette_a1 if palette is None else palette
    title = ' ' if title is None else title

    sns.set(style = 'whitegrid')

    g = sns.pairplot(
        data = data, hue = hue, palette = palette, 
        height = height, aspect = aspect)

    g.fig.suptitle(
        t = "".join([(' '), ('\n'*4), (title)]).upper(), y = 1.105, x = 0.02,
        horizontalalignment = 'left', fontsize = 20, fontweight = 'bold')

    for axes in g.axes:
        axes[0].set_ylabel(
            ylabel = "".join([('\n'*5), (axes[0].get_ylabel()), ('\n')]),
            fontdict = dict(fontsize = 13.5, fontweight = 'medium'))

    for axes in g.axes[len(g.axes)-1]:
        axes.set_xlabel(
            xlabel = "".join([('\n'), (axes.get_xlabel()), ('\n'*5)]),
            fontdict = dict(fontsize = 13.5, fontweight = 'medium'))    

    sns.move_legend(
        obj = g, loc = 'lower left', bbox_to_anchor = (0.01, 1), 
        ncol = 3, title = None, frameon = False,
        markerscale = 2,)

    plt.text(
        s = ' ', x = 1.3, y = 1,
        transform = g.axes[0, -1].transAxes)

    return g
In [ ]:
plot_pairplot(
    data = df[column_dist_overview_extract_by_corr], hue = 'Failure.binary',
    title = 'Distribution overview of features with 0.23 and above correlation to target'
);

Feature correlation overview¶

In [ ]:
column_corr_overview_extract_by_corr = list(
    df.corr()[['Failure.binary']]
    .rename(columns = {'Failure.binary': 'Correlation with target'})
    .query('`Correlation with target` > 0.2 | `Correlation with target` < -0.2')
    .index)

print(column_corr_overview_extract_by_corr)
['Entropy_cooc.W.ADC', 'GLNU_align.H.PET', 'DVAR_cooc.L.PET', 'Contrast_cooc.L.PET', 'IC1_d.H.PET', 'LZLGE.H.PET', 'Min_hist.ADC', 'Variance_hist.ADC', 'ratio_3ds_vol.ADC', 'IC1_.L.ADC', 'Strength_vdif_.L.ADC', 'IC1_d.H.ADC', 'Strength_vdif.H.ADC', 'Variance_cooc.W.ADC', 'DVAR_cooc.W.ADC', 'SVAR_cooc.W.ADC', 'Contrast_cooc.W.ADC', 'Tendency_cooc.W.ADC', 'Prominence_cooc.W.ADC', 'Complexity_vdif.W.ADC', 'GLVAR_align.W.ADC', 'GLVAR_area.W.ADC', 'Failure.binary']
In [ ]:
def plot_heatmap(
    data, figsize = (16, 8), cmap = sns.color_palette(palette = palette_a1, as_cmap = True),
    linewidths: int = 1, annot: bool = True, fmt: str = ".1f",
    title: str = None

) -> sns.heatmap:

    title = ' ' if title is None else title

    sns.set(style = 'whitegrid', rc = {'figure.figsize': figsize})

    g = sns.heatmap(
        data = data, cmap = cmap, linewidths = linewidths, 
        annot = annot, fmt = fmt, annot_kws = dict(fontsize = 9.5, alpha = 0.8))

    g.set_xlabel(xlabel = "".join([(' '), ('\n'*3), (' ') ]))
    g.set_ylabel(ylabel = "".join([(' '), ('\n'*3), (' ') ]))

    g.set_title(
        label = "".join([(' '), ('\n'*4), (title), ('\n'*1)]).upper(),
        fontdict = dict(fontsize = 14, fontweight = 'bold'), loc = 'left')

    plt.text(
        s = ' ', x = 1.23, y = 1,
        transform = g.axes.transAxes, )

    return g
In [ ]:
plot_heatmap(
    data = df[column_corr_overview_extract_by_corr].corr(),
    title = 'Correlation overview of features with a correlation of > 0.2 or < -0.2 to target'
);

From the plot, Entropy_cooc.W.ADC seems to be the most prominent out of all the features in explaining the target class outcome with a correlation value of 0.7 as opposed to the other features which got a correlation value of 0.4 and below

Feature scaling, dimensionality reduction, and feature extraction¶

It is a common practice to apply principal component analysis before executing clustering algorithms as it will help in finding latent features among the data, reduces dimensions, and therefore, decrease computational cost.

With the features being of different scale, it will be better to normalize rather than standardize. Thus, I will use MinMaxScaler()

In [ ]:
def pipe_scale_dimensionreduction(method) -> Pipeline:
    return Pipeline(steps = [
        ('scale', MinMaxScaler()),
        ('method', method), ])

Specifying to retain 95% of the variance in the original data after applying PCA

In [ ]:
method_pca = (
    pipe_scale_dimensionreduction(
        method = PCA(n_components = 0.95, random_state = 0))
    .fit(var_X))

Getting the number of components the algorithm has selected

In [ ]:
method_pca_n_components = method_pca.steps[1][1].n_components_
method_pca_n_components
Out[ ]:
11

Transforming the variable X using PCA and assigning it to a dataframe

In [ ]:
df_transformed = (
    pd.DataFrame(
        data = method_pca.transform(var_X),
        columns = [f'PC{x}' for x in range(1, method_pca_n_components + 1)], )
    .assign(**{'Failure.binary': var_y.to_numpy()}))

df_transformed.head()
Out[ ]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 Failure.binary
0 -1.999969 -0.662641 0.210586 -0.585717 0.285236 -0.466644 -0.174265 -0.204481 0.074304 -0.042223 -0.309827 0
1 -1.966676 -0.227635 0.745383 -0.499318 -0.696014 -0.355153 0.389122 0.128643 -0.041175 -0.128747 0.113712 1
2 -1.943384 -1.076428 -0.625807 0.413053 0.448163 -0.471302 -1.061687 -0.031165 -0.046357 -0.159227 -0.242236 0
3 -2.022240 -0.109832 0.567861 0.921422 0.146706 -0.052499 0.213032 -0.350346 -0.214044 -0.213044 -0.137190 1
4 -2.228796 -1.305999 -0.983146 0.678336 -0.049996 -0.026673 0.358001 0.303194 0.429558 0.473278 0.209292 0
In [ ]:
df_transformed.describe()
Out[ ]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 Failure.binary
count 1.970000e+02 197.000000 1.970000e+02 1.970000e+02 1.970000e+02 1.970000e+02 1.970000e+02 1.970000e+02 1.970000e+02 1.970000e+02 1.970000e+02 197.000000
mean -1.442726e-16 0.000000 -9.017040e-17 -5.410224e-17 5.635650e-17 -6.988206e-17 4.508520e-18 -1.352556e-17 4.508520e-18 -8.453475e-18 -3.465925e-17 0.340102
std 3.875903e+00 1.395630 1.101858e+00 9.883646e-01 6.834688e-01 6.450541e-01 5.914718e-01 4.895602e-01 4.812307e-01 3.868255e-01 3.601993e-01 0.474950
min -3.516060e+00 -3.086733 -3.010406e+00 -3.331563e+00 -2.459694e+00 -2.227148e+00 -1.296917e+00 -1.212472e+00 -1.321106e+00 -8.453350e-01 -1.073466e+00 0.000000
25% -2.230884e+00 -0.931490 -6.600262e-01 -6.433395e-01 -4.827580e-01 -3.303239e-01 -4.035737e-01 -2.903467e-01 -2.040759e-01 -2.714878e-01 -2.354584e-01 0.000000
50% -1.960455e+00 -0.297432 5.839066e-02 -2.689091e-02 3.781587e-02 1.894101e-02 -2.999805e-02 -8.642730e-02 3.540982e-02 -4.359250e-02 -2.517608e-02 0.000000
75% 5.367085e+00 0.736635 7.453831e-01 5.561163e-01 3.918559e-01 3.485577e-01 2.839140e-01 1.970778e-01 2.385197e-01 2.469965e-01 2.092923e-01 1.000000
max 7.762937e+00 3.804354 3.588896e+00 3.361647e+00 2.555962e+00 2.367281e+00 2.361818e+00 2.126416e+00 1.861162e+00 1.342040e+00 1.307724e+00 1.000000

Getting the PCA explained variance

In [ ]:
method_pca_explainedvariances = method_pca.steps[1][1].explained_variance_ratio_
print(method_pca_explainedvariances)
[0.67558736 0.08759435 0.05459928 0.0439309  0.02100744 0.01871234
 0.01573272 0.01077824 0.01041459 0.00672924 0.00583474]

Visualizing the explained variances by principal components¶

In [ ]:
def plot_lineplot(
    data, x, y, hue = None, figsize: tuple = (15, 4), 
    x_label: str = None, y_label: str = None, err_style: str = None, 
    title: str = None, subtitle: str = None, style: str = None,
    legend_ncol: int = 5,
) -> sns.lineplot:

    color = color_a1 if hue is None else None
    palette = palette_a1 if hue is not None else None
    title = ' ' if title is None else title
    subtitle = ' ' if subtitle is None else ' ' if hue is not None else subtitle

    sns.set(style = 'whitegrid', rc = {'figure.figsize': figsize})

    g = sns.lineplot(
        data = data, x = x, y = y, hue = hue, palette = palette, 
        color = color, marker = 'o',err_style = err_style, style = style,
        err_kws = dict(alpha = 0.08))

    g.set_xlabel("".join([('\n'*1), (g.get_xlabel() if x_label is None else x_label), ('\n'*3)]))
    g.set_ylabel("".join([('\n'*3), (g.get_ylabel() if y_label is None else y_label), ('\n'*1)]))

    g.set_title(
        label = "".join([('\n'*3), (title), ('\n'*1)]).upper(), loc = 'left',
        fontdict = dict(fontsize = 14, fontweight = 'bold'),
        y = 1.08, )

    plt.text(
        s = subtitle, x = 0, y = 1.08,
        alpha = 0.5, horizontalalignment = 'left',
        transform = g.axes.transAxes,
        fontdict = dict(fontsize = 11),)

    plt.text(
        s = ' ', x = 1.08, y = 1,
        transform = g.axes.transAxes,)
        
    if hue is not None:
        sns.move_legend(
            obj = g, loc = 'lower left', bbox_to_anchor = (-0.01, 1), 
            ncol = legend_ncol, title = None, frameon = False,
            markerscale = 2, )

    sns.despine()

    return g
In [ ]:
plot_lineplot(
    data = pd.DataFrame(dict(
        principal_components = [f'PC{x}' for x in range(1, method_pca_n_components + 1)],
        explained_variances = method_pca_explainedvariances, )),
    x = 'principal_components', y = 'explained_variances', 
    x_label = 'Principal Components', y_label = 'Explained Variances',
    title = 'Principal component analysis: Explained variances by principal components',
    subtitle = f'Total retained variance: {np.sum(method_pca_explainedvariances): .2%}'
);

Workflow handling¶

Making a class to handle the same workflow of the classification models to save up processing time when computing the later on ROC AUC curve.

Using GridSearchCV for hyperparameter tuning with StratifiedKfold as the cross validator: N-fold cross validator with 10 as the default

In [ ]:
class ProcessModel():
    def __init__(
        self, dataframe: pd.DataFrame, model, 
        model_name: str, model_param_grid: dict = {}
        ) -> None:

        self.model_name = model_name
        self.dataframe = dataframe
        self.gscv = GridSearchCV(estimator = model, param_grid = model_param_grid)
        self.fit_model()
    

    def __set_primary_variables(self, target_colname: str = None) -> None:
        # Assign the target column as the last column if nothing is provided in the `target_colname` parameter
        self.target_colname = self.dataframe.columns[-1] if target_colname is None else target_colname

        # Assigning the primary variables
        self.var_X = self.dataframe.iloc[:, :-1]
        self.var_y = self.dataframe.iloc[:, -1]

    def get_X(self) -> Union[pd.DataFrame, pd.Series]:
        return self.var_X

    def get_y(self) -> Union[pd.DataFrame, pd.Series]:
        return self.var_y
        

    def __split_dataset(self) -> None:
        # Splitting the data to training and testing set and use cross validation in lieu to using validation set
        self.var_X_train, self.var_X_test, self.var_y_train, self.var_y_test = (
            train_test_split(self.var_X, self.var_y, stratify = self.var_y, random_state = 0,) )

    def get_splitted_data(self) -> dict:
        splitted_data = dict(
            X_train = self.var_X_train, X_test = self.var_X_test,
            y_train = self.var_y_train, y_test = self.var_y_test, )

        return splitted_data


    def _set_crossvalidator(self, cross_validator = None, cv_nsplits: int = 10) -> None:
        # N-fold cross validation with 10 as default and grid search through the parameter grid
        self.cross_validator = StratifiedKFold(n_splits = cv_nsplits) if cross_validator is None else cross_validator
        self.gscv.set_params(cv = self.cross_validator)

    def _get_crossvalidator(self) -> 'cross_validator':
        return self.cross_validator


    def get_predictions(self) -> dict:
        # Predicted y values
        predictions = dict(
            y_train = self.gscv.predict(self.var_X_train), 
            y_test = self.gscv.predict(self.var_X_test), )

        return predictions

    def get_confusion_matrices(self) -> dict:
        # Getting the train and test confusion matrix
        predictions = self.get_predictions()
        confusion_martrices = dict(
            train = confusion_matrix(y_true = self.var_y_train, y_pred = predictions['y_train']),
            test = confusion_matrix(y_true = self.var_y_test, y_pred = predictions['y_test']), )

        return confusion_martrices

    def get_subsets_by_confusion_matrix_dataframe(self) -> pd.DataFrame:
        # Make a dataframe out of the confusion matrix of the subsets
        confusion_matrices = self.get_confusion_matrices()
        processed_dataframe = pd.DataFrame(dict(
            element = ['True Negative', 'False Positive', 'False Negative', 'True Positive'] * 2,
            value = list(confusion_matrices['train'].ravel()) + list(confusion_matrices['test'].ravel()),
            set = ('Train ' * 4).split() + ('Test ' * 4).split(), ))

        return processed_dataframe

    def get_accuracy_scores(self) -> dict:
        # Evaluate train and test
        accuracy_scores = dict(
            train = self.gscv.score(X = self.var_X_train, y = self.var_y_train),
            test = self.gscv.score(X = self.var_X_test, y = self.var_y_test), )
        
        return accuracy_scores
        
    def get_classification_reports(self) -> dict:
        # Summarize classification result
        predictions = self.get_predictions()
        classification_reports = dict(
            train = classification_report(y_true = self.var_y_train, y_pred = predictions['y_train']),
            test = classification_report(y_true = self.var_y_test, y_pred = predictions['y_test']))

        return classification_reports

    def get_roc_curve(self) -> dict:
        # Getting roc curve of the test set
        if hasattr(self.gscv, "decision_function"):
            receiver_operating_characteristic = dict(zip(
                ['fpr', 'tpr', 'thresholds'], 
                roc_curve(y_true = self.var_y_test, y_score = self.gscv.decision_function(self.var_X_test)), ))
        else:
            receiver_operating_characteristic = dict(zip(
                ['fpr', 'tpr', 'thresholds'], 
                roc_curve(y_true = self.var_y_test, y_score = self.gscv.predict_proba(self.var_X_test)[:, 1]), ))            

        return receiver_operating_characteristic

    def get_auc(self) -> float:
        # Get auc
        receiver_operating_characteristic_curve = self.get_roc_curve()
        area_under_curve = auc(
            x = receiver_operating_characteristic_curve['fpr'], 
            y = receiver_operating_characteristic_curve['tpr'])

        return area_under_curve 

    def get_model_name(self) -> str:
        return self.model_name

    def get(self) -> GridSearchCV:
        # Get gridsearch instance
        return self.gscv

        
    def fit_model(self) -> None:
        self.__set_primary_variables()
        self.__split_dataset()
        self._set_crossvalidator()
        # Training the model
        self.gscv.fit(X = self.var_X_train, y = self.var_y_train)

Visualize model evaluation

In [ ]:
def visualize_model_evaluation(
    dataframe: pd.DataFrame, train_score: np.float64, test_score: np.float64, 
    classification_report_train: str, classification_report_test: str,
    palette = None, height: int = 3, aspect: int = 4, sharex: bool = False,
    title_model: str = None,
) -> sns.FacetGrid:

    palette = palette_a1 if palette is None else palette
    title_model = ' ' if title_model is None else title_model

    sns.set(style = 'whitegrid')

    g = sns.FacetGrid(
        data = dataframe, row = 'set', height = height, 
        aspect = aspect, sharex = sharex, )

    g.map_dataframe(
        func = sns.barplot, x = 'value', y = 'element', 
        palette = palette)

    g.fig.suptitle(
        "".join([(' '), ('\n'*3), ('Model: '), (title_model), ('\n'*1)]).upper(),
        fontweight = 'bold', y = 1.6, horizontalalignment = 'left', x = 0, transform = g.axes[0,0].transAxes)

    plt.text(
        s = "Model evaluation on the given dataset", x = 0, y = 1.05,
        alpha = 0.5, transform = g.axes[0,0].transAxes)

    plt.text(
        s = classification_report_train.upper(), x = 1.1, y = 0,
        alpha = 0.8, transform = g.axes[0,0].transAxes,
        fontdict = dict(fontfamily = 'monospace'))

    plt.text(
        s = classification_report_test.upper(), x = 1.1, y = 0,
        alpha = 0.8, transform = g.axes[1,0].transAxes,
        fontdict = dict(fontfamily = 'monospace'))        

    g.set_titles(row_template = '{row_name} Set',)
        
    for axes in g.axes:
        for axis in axes:
            axis.bar_label(axis.containers[0], padding = 10)
            axis.set_ylabel("".join([(' '), ('\n'*3)]))
            axis.set_xlabel(
                xlabel = (
                    "".join([(' '), ('\n'*1), (axis.get_title()), (': '), 
                    ('Accuracy Score = '), (f'{train_score: .3f}' if axis == g.axes[0,0] else f'{test_score: .3f}'), 
                    ('\n'*3)])),
                fontdict = dict(fontsize = 13, fontweight = 'medium'), 
                loc = 'left')
            axis.set_title(' ')

    plt.text(
        s = ' ', x = 1.7, y = 1,
        transform = g.axes[0,0].transAxes)

    plt.subplots_adjust(hspace = 0.5)

    sns.despine()

    return g

Execution¶

In [ ]:
def evaluate_model(processed_model: ProcessModel) -> sns.FacetGrid:
    dataframe = processed_model.get_subsets_by_confusion_matrix_dataframe()
    classification_reports = processed_model.get_classification_reports()
    accuracy_scores = processed_model.get_accuracy_scores()
    title_model = processed_model.get_model_name()

    return visualize_model_evaluation(
        dataframe = dataframe, title_model = title_model,
        train_score = accuracy_scores['train'], test_score = accuracy_scores['test'],
        classification_report_train = classification_reports['train'],
        classification_report_test = classification_reports['test'], );

Logistic regression¶

In [ ]:
model_logreg = ProcessModel(
    dataframe = df_transformed, model = LogisticRegression(max_iter = 5000),
    model_name = "Logistic Regression",
    model_param_grid = {
        'C': np.linspace(0.01, 100, num = 50),
        'penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}, )

Grid search and cross validation result

In [ ]:
pd.DataFrame(model_logreg.get().cv_results_).sort_values(by = 'rank_test_score').head()
Out[ ]:
mean_fit_time std_fit_time mean_score_time std_score_time param_C param_penalty param_solver params split0_test_score split1_test_score ... split3_test_score split4_test_score split5_test_score split6_test_score split7_test_score split8_test_score split9_test_score mean_test_score std_test_score rank_test_score
33 0.006496 0.000875 0.002558 0.000296 2.050612 l2 sag {'C': 2.050612244897959, 'penalty': 'l2', 'sol... 0.8 0.733333 ... 0.8 0.8 0.933333 0.733333 0.785714 0.857143 0.642857 0.801905 0.085189 1
30 0.014095 0.001327 0.002350 0.000575 2.050612 l2 newton-cg {'C': 2.050612244897959, 'penalty': 'l2', 'sol... 0.8 0.733333 ... 0.8 0.8 0.933333 0.733333 0.785714 0.857143 0.642857 0.801905 0.085189 1
31 0.012932 0.001140 0.002911 0.000449 2.050612 l2 lbfgs {'C': 2.050612244897959, 'penalty': 'l2', 'sol... 0.8 0.733333 ... 0.8 0.8 0.933333 0.733333 0.785714 0.857143 0.642857 0.801905 0.085189 1
34 0.008778 0.000762 0.002644 0.000609 2.050612 l2 saga {'C': 2.050612244897959, 'penalty': 'l2', 'sol... 0.8 0.733333 ... 0.8 0.8 0.933333 0.733333 0.785714 0.857143 0.642857 0.801905 0.085189 1
27 0.003686 0.000786 0.002455 0.000428 2.050612 l1 liblinear {'C': 2.050612244897959, 'penalty': 'l1', 'sol... 0.8 0.733333 ... 0.8 0.8 0.866667 0.733333 0.785714 0.857143 0.642857 0.795238 0.076842 5

5 rows × 21 columns

Model best parameters by grid search

In [ ]:
model_logreg.get().best_params_
Out[ ]:
{'C': 2.050612244897959, 'penalty': 'l2', 'solver': 'newton-cg'}

Model evaluation: Visualized

In [ ]:
evaluate_model(model_logreg)
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x7fb2979ca0a0>

K-nearest neighbors¶

In [ ]:
model_knn = ProcessModel(
    dataframe = df_transformed, model = KNeighborsClassifier(),
    model_name = "K-Nearest Neighbors",
    model_param_grid = {
        'n_neighbors': range(3, 25),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'metric': ['minkowski', 'euclidean', 'manhattan', 'chebyshev']}, )

Grid search and cross validation result

In [ ]:
pd.DataFrame(model_knn.get().cv_results_).sort_values(by = 'rank_test_score').head()
Out[ ]:
mean_fit_time std_fit_time mean_score_time std_score_time param_algorithm param_metric param_n_neighbors param_weights params split0_test_score ... split3_test_score split4_test_score split5_test_score split6_test_score split7_test_score split8_test_score split9_test_score mean_test_score std_test_score rank_test_score
271 0.003294 0.000437 0.003647 0.000576 ball_tree manhattan 6 distance {'algorithm': 'ball_tree', 'metric': 'manhatta... 0.8 ... 0.733333 0.733333 0.933333 0.666667 0.785714 0.785714 0.714286 0.768571 0.080339 1
95 0.003065 0.000389 0.003718 0.000857 auto manhattan 6 distance {'algorithm': 'auto', 'metric': 'manhattan', '... 0.8 ... 0.733333 0.733333 0.933333 0.666667 0.785714 0.785714 0.714286 0.768571 0.080339 1
623 0.005213 0.001086 0.004970 0.000695 brute manhattan 6 distance {'algorithm': 'brute', 'metric': 'manhattan', ... 0.8 ... 0.733333 0.733333 0.933333 0.666667 0.785714 0.785714 0.714286 0.768571 0.080339 1
447 0.003401 0.000378 0.004017 0.000672 kd_tree manhattan 6 distance {'algorithm': 'kd_tree', 'metric': 'manhattan'... 0.8 ... 0.733333 0.733333 0.933333 0.666667 0.785714 0.785714 0.714286 0.768571 0.080339 1
662 0.004962 0.001235 0.005419 0.001320 brute chebyshev 4 uniform {'algorithm': 'brute', 'metric': 'chebyshev', ... 0.8 ... 0.800000 0.733333 0.800000 0.733333 0.714286 0.642857 0.642857 0.753333 0.081282 5

5 rows × 22 columns

Model best parameters by grid search

In [ ]:
model_knn.get().best_params_
Out[ ]:
{'algorithm': 'auto',
 'metric': 'manhattan',
 'n_neighbors': 6,
 'weights': 'distance'}

Model evaluation: Visualized

In [ ]:
evaluate_model(model_knn)
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x7fb296cc1670>

Support vector machine¶

In [ ]:
model_svc = ProcessModel(
    dataframe = df_transformed, model = SVC(max_iter = 5000, random_state = 0),
    model_name = "Support Vector Classifier",
    model_param_grid = {
        'C': np.linspace(0.01, 50, 50),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': range(3, 7), 'gamma': ['scale', 'auto']}, )

Grid search and cross validation result

In [ ]:
pd.DataFrame(model_svc.get().cv_results_).sort_values(by = 'rank_test_score').head()
Out[ ]:
mean_fit_time std_fit_time mean_score_time std_score_time param_C param_degree param_gamma param_kernel params split0_test_score ... split3_test_score split4_test_score split5_test_score split6_test_score split7_test_score split8_test_score split9_test_score mean_test_score std_test_score rank_test_score
98 0.005873 0.001106 0.003280 0.000855 3.070612 3 scale rbf {'C': 3.0706122448979594, 'degree': 3, 'gamma'... 0.8 ... 0.733333 0.800000 0.933333 0.733333 0.785714 0.857143 0.714286 0.809048 0.090060 1
106 0.004595 0.000360 0.002955 0.000774 3.070612 4 scale rbf {'C': 3.0706122448979594, 'degree': 4, 'gamma'... 0.8 ... 0.733333 0.800000 0.933333 0.733333 0.785714 0.857143 0.714286 0.809048 0.090060 1
114 0.005251 0.001082 0.002443 0.000363 3.070612 5 scale rbf {'C': 3.0706122448979594, 'degree': 5, 'gamma'... 0.8 ... 0.733333 0.800000 0.933333 0.733333 0.785714 0.857143 0.714286 0.809048 0.090060 1
122 0.005394 0.000669 0.002979 0.000565 3.070612 6 scale rbf {'C': 3.0706122448979594, 'degree': 6, 'gamma'... 0.8 ... 0.733333 0.800000 0.933333 0.733333 0.785714 0.857143 0.714286 0.809048 0.090060 1
186 0.005594 0.000767 0.002814 0.000496 5.11102 6 scale rbf {'C': 5.111020408163266, 'degree': 6, 'gamma':... 0.8 ... 0.733333 0.866667 0.933333 0.800000 0.714286 0.857143 0.642857 0.794762 0.084016 5

5 rows × 22 columns

Model best parameters by grid search

In [ ]:
model_svc.get().best_params_
Out[ ]:
{'C': 3.0706122448979594, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}

Model evaluation: Visualized

In [ ]:
evaluate_model(model_svc)
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x7fb29e113a00>

Random Forest Classifier¶

In [ ]:
model_rfc = ProcessModel(
    dataframe = df_transformed, model = RandomForestClassifier(),
    model_name = "Random Forest Classifier",
    model_param_grid = {
        'bootstrap': [True, False],
        'max_depth': list(np.arange(0, 101, 10)) + [None],
        'max_features': ['auto', 'sqrt'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': np.arange(10, 100, 15)}, )

Grid search and cross validation result

In [ ]:
pd.DataFrame(model_rfc.get().cv_results_).sort_values(by = 'rank_test_score').head()
Out[ ]:
mean_fit_time std_fit_time mean_score_time std_score_time param_bootstrap param_max_depth param_max_features param_min_samples_leaf param_min_samples_split param_n_estimators ... split3_test_score split4_test_score split5_test_score split6_test_score split7_test_score split8_test_score split9_test_score mean_test_score std_test_score rank_test_score
441 0.109441 0.003113 0.009728 0.001314 True 40 auto 1 5 55 ... 0.866667 0.866667 0.933333 0.733333 0.857143 0.857143 0.714286 0.836190 0.087370 1
2293 0.051826 0.001404 0.007464 0.001135 False 90 auto 2 5 25 ... 0.800000 0.866667 0.933333 0.666667 0.785714 0.928571 0.714286 0.829524 0.102105 2
2070 0.022671 0.001007 0.004405 0.000536 False 70 auto 2 2 10 ... 0.800000 0.800000 0.933333 0.866667 0.785714 0.857143 0.714286 0.829048 0.065099 3
569 0.163284 0.002472 0.012836 0.001111 True 50 auto 2 5 85 ... 0.866667 0.866667 0.866667 0.733333 0.785714 0.857143 0.714286 0.829048 0.083094 3
280 0.165139 0.001798 0.013227 0.001063 True 20 sqrt 1 5 70 ... 0.866667 0.800000 0.933333 0.733333 0.785714 0.857143 0.714286 0.829048 0.088280 3

5 rows × 24 columns

Model best parameters by grid search

In [ ]:
model_rfc.get().best_params_
Out[ ]:
{'bootstrap': True,
 'max_depth': 40,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 55}

Model evaluation: Visualized

In [ ]:
evaluate_model(model_rfc)
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x7fb29c2a47f0>

Extreme gradient boosting¶

In [ ]:
model_xgboost = ProcessModel(
    dataframe = df_transformed, model = XGBClassifier(verbosity = 0),
    model_name = "Extreme Gradient Boosting",
    model_param_grid = {
        'learning_rate': np.arange(0.05, 0.35, 0.05),
        'max_depth': np.arange(3, 16),
        'min_child_weight': np.arange(1, 8, 2),
        'gamma': np.arange(0, 0.5, 0.1),
        'colsample_bytree': np.arange(0.3, 0.8, 0.1)}, )

Grid search and cross validation result

In [ ]:
pd.DataFrame(model_xgboost.get().cv_results_).sort_values(by = 'rank_test_score').head()
Out[ ]:
mean_fit_time std_fit_time mean_score_time std_score_time param_colsample_bytree param_gamma param_learning_rate param_max_depth param_min_child_weight params ... split3_test_score split4_test_score split5_test_score split6_test_score split7_test_score split8_test_score split9_test_score mean_test_score std_test_score rank_test_score
1560 0.177718 0.110570 0.009497 0.003545 0.4 0.0 0.05 3 1 {'colsample_bytree': 0.4, 'gamma': 0.0, 'learn... ... 0.866667 0.866667 0.933333 0.733333 0.785714 0.857143 0.642857 0.828571 0.100000 1
2184 0.478101 0.261755 0.017851 0.005470 0.4 0.2 0.05 3 1 {'colsample_bytree': 0.4, 'gamma': 0.2, 'learn... ... 0.866667 0.866667 0.933333 0.733333 0.785714 0.857143 0.642857 0.821905 0.099459 2
1872 0.241102 0.184433 0.013217 0.006992 0.4 0.1 0.05 3 1 {'colsample_bytree': 0.4, 'gamma': 0.1, 'learn... ... 0.866667 0.866667 0.933333 0.733333 0.785714 0.857143 0.642857 0.821905 0.099459 2
2808 0.184585 0.131049 0.016801 0.009318 0.4 0.4 0.05 3 1 {'colsample_bytree': 0.4, 'gamma': 0.4, 'learn... ... 0.866667 0.866667 0.933333 0.733333 0.785714 0.857143 0.642857 0.821905 0.108027 2
4060 0.069630 0.006532 0.005894 0.000238 0.5 0.3 0.05 4 1 {'colsample_bytree': 0.5, 'gamma': 0.300000000... ... 0.800000 0.866667 0.933333 0.733333 0.785714 0.857143 0.642857 0.821905 0.099459 2

5 rows × 23 columns

Model best parameters by grid search

In [ ]:
model_xgboost.get().best_params_
Out[ ]:
{'colsample_bytree': 0.4,
 'gamma': 0.0,
 'learning_rate': 0.05,
 'max_depth': 3,
 'min_child_weight': 1}

Model evaluation: Visualized

In [ ]:
evaluate_model(model_xgboost)
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x7fb2a37641f0>

ROC-AUC Evaluation¶

The Receiver Operating Characteristic (ROC) curve and Area Under the Curve (AUC), a performance measurement for classification, tells us how much the models are capable of distinguishing between classes. With the higher area being the better in predicting 0 classes as 0 and 1 classes as 1.

Making a dataframe composed of the models' ROC elements. Specifically, the false positive rates and true positive rates.

In [ ]:
def make_dataframe(model) -> pd.DataFrame:
    model_name = model.get_model_name()
    model_roc_curve = model.get_roc_curve()
    model_auc = model.get_auc()

    df = pd.DataFrame(dict(
        model = (f"{model_name.replace(' ', '_')}:{model_auc:.3f} ".split() * len(model_roc_curve['thresholds'])),
        fpr = model_roc_curve['fpr'], tpr = model_roc_curve['tpr'], ))

    df['model'].replace({'_': ' ', ':': ': '}, regex = True, inplace = True)

    return df
In [ ]:
def make_roc_auc_dataframe_from_model_list(
    model_list: list[Type[ProcessModel]], model_names: dict = None,
    ) -> pd.DataFrame:

    model_names = {} if model_names is None else model_names

    df_dict = {}
    for model in model_list:
        df_dict.update({model.get_model_name(): make_dataframe(model)})

    df_dict_key_firstmodel = list(df_dict.keys())[0]
    df = df_dict[df_dict_key_firstmodel]
    for model_name, dataframe in df_dict.items():
        if model_name == df_dict_key_firstmodel:
            continue
        df = df.append(dataframe).reset_index(drop = True)

    df['model'].replace(model_names, regex = True, inplace = True)

    return df

With Support Vector Classifier's area under the curve value of 0.793 and it having a greater area than the other models, in choosing between the five, SVC is therefore prominent model.

In [ ]:
plot_lineplot(
    data = make_roc_auc_dataframe_from_model_list(
        model_list = [model_logreg, model_knn, model_svc, model_rfc, model_xgboost],
        model_names = {
            'Logistic Regression': 'logreg', 'K-Nearest Neighbors': 'knn',
            'Support Vector Classifier': 'svc', 'Random Forest Classifier': 'rfc',
            'Extreme Gradient Boosting': 'xgboost'}),
    x = 'fpr', y = 'tpr', hue = 'model', style = 'model',
    err_style = 'band', title = 'ROC-AUC Evaluation: ROC Curve by Models',
    x_label = 'False Positive Rate', y_label = 'True Positive Rate'
);